Introduce a product classifier #6680

Draft · wants to merge 3 commits into base: main
Changes from all commits
71 changes: 53 additions & 18 deletions kitsune/llm/questions/classifiers.py
@@ -3,9 +3,16 @@
from django.db import models
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

from kitsune.llm.questions.prompt import spam_parser, spam_prompt, topic_parser, topic_prompt
from kitsune.llm.questions.prompt import (
product_parser,
product_prompt,
spam_parser,
spam_prompt,
topic_parser,
topic_prompt,
)
from kitsune.llm.utils import get_llm
from kitsune.products.utils import get_taxonomy
from kitsune.products.utils import get_products, get_taxonomy

DEFAULT_LLM_MODEL = "gemini-2.5-flash-preview-04-17"
HIGH_CONFIDENCE_THRESHOLD = 75
@@ -39,32 +46,60 @@ def classify_question(question: "Question") -> dict[str, Any]:
}

spam_detection_chain = spam_prompt | llm | spam_parser
product_classification_chain = product_prompt | llm | product_parser
topic_classification_chain = topic_prompt | llm | topic_parser

def handle_spam(payload: dict[str, Any], spam_result: dict[str, Any]) -> dict[str, Any]:
"""Handle spam classification with potential product reclassification."""
confidence = spam_result.get("confidence", 0)
match confidence:
case _ if confidence >= HIGH_CONFIDENCE_THRESHOLD:
action = ModerationAction.SPAM
case _ if confidence > LOW_CONFIDENCE_THRESHOLD:
action = ModerationAction.FLAG_REVIEW
case _:
action = ModerationAction.NOT_SPAM

if not ((action == ModerationAction.SPAM) and spam_result.get("maybe_misclassified")):
return {"action": action, "product_result": {}}

payload["products"] = get_products(output_format="JSON")
product_result = product_classification_chain.invoke(payload)
new_product = product_result.get("product")

if new_product and new_product != payload["product"]:
payload["product"] = new_product
payload["topics"] = get_taxonomy(
new_product, include_metadata=["description", "examples"], output_format="JSON"
)
topic_result = topic_classification_chain.invoke(payload)
return {
"action": ModerationAction.NOT_SPAM,
"product_result": product_result,
"topic_result": topic_result,
}
else:
return {
"action": ModerationAction.SPAM,
"product_result": product_result,
}

def decision_lambda(payload: dict[str, Any]) -> dict[str, Any]:
spam_result: dict[str, Any] = payload["spam_result"]
confidence: int = spam_result.get("confidence", 0)
is_spam: bool = spam_result.get("is_spam", False)
result = {
"action": ModerationAction.NOT_SPAM,

base_result = {
"spam_result": spam_result,
"product_result": {},
"topic_result": {},
}

if is_spam:
match confidence:
case _ if confidence >= HIGH_CONFIDENCE_THRESHOLD:
result["action"] = ModerationAction.SPAM
case _ if (
confidence > LOW_CONFIDENCE_THRESHOLD
and confidence < HIGH_CONFIDENCE_THRESHOLD
):
result["action"] = ModerationAction.FLAG_REVIEW

if result["action"] == ModerationAction.NOT_SPAM:
result["topic_result"] = topic_classification_chain.invoke(payload)

return result
spam_handling = handle_spam(payload, spam_result)
return {**base_result, **spam_handling}

topic_result = topic_classification_chain.invoke(payload)
return {**base_result, "topic_result": topic_result}

pipeline = RunnablePassthrough.assign(spam_result=spam_detection_chain) | RunnableLambda(
decision_lambda
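For review context, a minimal sketch of the pipeline's end-to-end behavior. The payload construction and the return statement of `classify_question` are collapsed in the hunk above, so the invocation and result shape below are inferred from `decision_lambda` and `handle_spam`, not confirmed by the diff:

```python
# Illustrative sketch only: the payload keys and the return statement of
# classify_question() are collapsed above, so this shows assumed wiring.
from kitsune.llm.questions.classifiers import classify_question

result = classify_question(question)  # question: a kitsune Question instance

# Based on decision_lambda and handle_spam, `result` should look like:
# {
#     "action": ModerationAction.SPAM | FLAG_REVIEW | NOT_SPAM,
#     "spam_result": {"is_spam": ..., "confidence": ..., "reason": ...,
#                     "maybe_misclassified": ...},
#     "product_result": {...},  # non-empty only when a spam verdict triggered
#                               # product reclassification
#     "topic_result": {...},    # non-empty whenever the question ends up NOT_SPAM
# }
```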
91 changes: 90 additions & 1 deletion kitsune/llm/questions/prompt.py
@@ -16,7 +16,7 @@
- Encourages illegal, unethical, or dangerous behavior.
- Promotes political views or propaganda unrelated to the product.
- Is extremely short (e.g., less than 10 words), overly vague, or the primary purpose of the question cannot be understood from the text.
- Intent or relevance to Mozilla's "{product}" cannot be determined.
- Its intent cannot be determined.
- Contains excessive random symbols, emojis, or gibberish text.
- Contains QR codes or links/images directing users off-site.
- Clearly unrelated to Mozilla's "{product}" product features, functionality or purpose.
@@ -29,11 +29,56 @@
- `0` = Extremely uncertain.
- `100` = Completely certain.
4. Provide a concise explanation supporting your decision.
5. **Determine if the question was misclassified due to the wrong product:** True only if this is a legitimate Mozilla support request that
doesn't relate to "{product}" but clearly relates to another Mozilla product.

# Response format
{format_instructions}
"""

PRODUCT_INSTRUCTIONS = """
# Role and Goal
You are a specialized product reclassification agent for Mozilla's support forums.
Your task is to evaluate user-submitted questions previously flagged as spam and determine
if they should instead be reassigned to a specific Mozilla product category.

# Available Mozilla Products
You MUST select exactly one product from the following JSON-formatted list if reassignment is appropriate:
- **title**: Name of the product.
- **description**: A short description of the product.

```json
{products}
```

# When to Reassign a Question
Reassign a question to a specific product ONLY if **all** of these criteria apply:
- The question explicitly mentions or clearly relates to the product's distinctive features or functionalities.
- The question includes technical terms, error messages, or workflows unique to the specific product.
- You are highly confident the original spam classification resulted from incorrect product selection.
- The content represents a legitimate support request, not promotional or spam content.

# When NOT to Reassign
Do NOT reassign the question if **any** of these criteria apply:
- The content is genuinely promotional, spam, inappropriate, or clearly unrelated to Mozilla products.
- You cannot confidently determine the relevant Mozilla product.
- The question equally involves multiple Mozilla products with no clear primary focus.
- The original spam classification appears correct, regardless of product selection.

# Task Instructions
Given a user-submitted question previously flagged as spam, strictly follow these steps:
1. **Carefully Evaluate** whether the question clearly relates to a specific Mozilla product.
2. **Spam Verification** - Confirm explicitly that the content is not promotional or actual spam.
3. **Determine Reassignment:** If the question meets **all** reassignment criteria, explicitly select the most appropriate product. Otherwise, do not reassign.
4. Indicate your **confidence** in your decision (0-100), with higher scores indicating stronger certainty:
- `0` = Extremely uncertain.
- `100` = Completely certain.
5. Provide a concise explanation (1–2 sentences) clearly supporting your decision.

# Response Format
{format_instructions}
"""

TOPIC_INSTRUCTIONS = """
# Role and goal
You are a content classification agent specialized in Mozilla's "{product}" product support forums.
@@ -100,6 +145,14 @@
type="str",
description="The reason for identifying the question as spam or not spam.",
),
ResponseSchema(
name="maybe_misclassified",
type="bool",
description=(
"True if this appears to be a legitimate Mozilla support request"
" that was flagged as spam solely due to incorrect product categorization."
),
),
)
)

@@ -119,6 +172,34 @@
)
)

product_parser = StructuredOutputParser.from_response_schemas(
(
ResponseSchema(
name="product",
type="str",
description=(
"The Mozilla product selected for reassignment or null if no reassignment"
" should be made."
),
),
ResponseSchema(
name="confidence",
type="int",
description=(
"An integer from 0 to 100 that indicates the level of confidence in the"
" product reassignment decision, with 0 representing the lowest confidence"
" and 100 the highest."
),
),
ResponseSchema(
name="reason",
type="str",
description="The reason for reassigning to the selected product "
" or for not reassigning.",
),
)
)


spam_prompt = ChatPromptTemplate(
(
@@ -134,3 +215,11 @@
("human", USER_QUESTION),
)
).partial(format_instructions=topic_parser.get_format_instructions())


product_prompt = ChatPromptTemplate(
(
("system", PRODUCT_INSTRUCTIONS),
("human", USER_QUESTION),
)
).partial(format_instructions=product_parser.get_format_instructions())
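To make the new prompt and parser concrete: the `{products}` placeholder is filled from `get_products(output_format="JSON")` with `title`/`description` entries, and `product_parser` expects the model to answer with a fenced JSON object carrying its three schema fields. A hedged example, where all values are invented and only the key names come from this diff:

```python
# Hypothetical example: product entries and the model's reply are invented;
# only the field names come from the schemas introduced in this diff.
products_json = (
    '[{"title": "Firefox", "description": "Mozilla\'s desktop web browser."},'
    ' {"title": "Thunderbird", "description": "Mozilla\'s email client."}]'
)  # roughly what get_products(output_format="JSON") is expected to supply

fence = "`" * 3  # keeps a literal markdown fence out of this example
model_reply = (
    f"{fence}json\n"
    '{"product": "Firefox", "confidence": 88,'
    ' "reason": "Describes a crash on startup of the desktop browser."}'
    f"\n{fence}"
)

parsed = product_parser.parse(model_reply)
assert parsed["product"] == "Firefox"
```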
2 changes: 1 addition & 1 deletion kitsune/products/utils.py
@@ -1,7 +1,7 @@
import json

from django.db.models import Prefetch, Q
import yaml
from django.db.models import Prefetch, Q

from kitsune.products.models import Product, Topic

88 changes: 59 additions & 29 deletions kitsune/questions/utils.py
@@ -176,41 +176,71 @@ def process_classification_result(
) -> None:
"""
Process the classification result from the LLM and take moderation action.
Handles spam, flag review, and updates to product and topic if suggested by the classifier.
"""
sumo_bot = Profile.get_sumo_bot()
action = result.get("action")
match action:
case ModerationAction.SPAM:
question.mark_as_spam(sumo_bot)
case ModerationAction.FLAG_REVIEW:

if action == ModerationAction.SPAM:
question.mark_as_spam(sumo_bot)
return
elif action == ModerationAction.FLAG_REVIEW:
flag_question(
question,
by_user=sumo_bot,
notes=(
"LLM flagged for manual review, for the following reason:\n"
f"{result.get('spam_result', {}).get('reason', '')}"
),
reason=FlaggedObject.REASON_SPAM,
)
return

product_result = result.get("product_result", {})
topic_result = result.get("topic_result", {})
new_product_title = product_result.get("product")
new_topic_title = topic_result.get("topic")

update_kwargs = {}

if (
new_product_title
and hasattr(question, "product")
and getattr(question.product, "title", None) != new_product_title
):
from kitsune.products.models import Product

try:
new_product = Product.objects.get(title=new_product_title)
except Product.DoesNotExist:
log.warning(
f"LLM suggested product '{new_product_title}' does not exist. Skipping product update."
)
else:
update_kwargs["product"] = new_product

if new_topic_title:
try:
topic = Topic.active.get(title=new_topic_title, visible=True)
except (Topic.DoesNotExist, Topic.MultipleObjectsReturned):
log.warning(
f"LLM suggested topic '{new_topic_title}' is invalid. Skipping topic update."
)
else:
update_kwargs["topic"] = topic

if update_kwargs:
question.save(**update_kwargs)
question.clear_cached_tags()
question.auto_tag()

if update_kwargs.get("topic"):
flag_question(
question,
by_user=sumo_bot,
notes=(
"LLM flagged for manual review, for the following reason:\n"
f"{result['spam_result']['reason']}"
f"LLM classified as {topic.title}, for the following reason:\n"
f"{topic_result.get('reason', '')}"
),
reason=FlaggedObject.REASON_SPAM,
status=FlaggedObject.FLAG_ACCEPTED,
)
case _:
if topic_title := result["topic_result"].get("topic"):
try:
topic = Topic.active.get(title=topic_title, visible=True)
except (Topic.DoesNotExist, Topic.MultipleObjectsReturned):
return
else:
flag_question(
question,
by_user=sumo_bot,
notes=(
"LLM classified as {topic.title}, for the following reason:\n"
f"{result['topic_result']['reason']}"
),
status=FlaggedObject.FLAG_ACCEPTED,
)
if question.topic:
question.tags.remove(question.topic.slug)
question.topic = topic
question.save()
question.tags.add(topic.slug)
question.clear_cached_tags()
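Putting the two halves of the PR together, a usage sketch. The full parameter list of `process_classification_result` is collapsed in the hunk above, so the call below assumes it takes the question plus the result dict from `classify_question`:

```python
# Illustrative wiring only: process_classification_result's full signature is
# collapsed in the hunk above, so this assumes (question, result) parameters.
from kitsune.llm.questions.classifiers import classify_question
from kitsune.questions.utils import process_classification_result

result = classify_question(question)
process_classification_result(question, result)
# SPAM        -> question.mark_as_spam(sumo_bot), nothing else
# FLAG_REVIEW -> flagged for manual review with the spam reason
# NOT_SPAM    -> suggested product/topic validated against the DB, saved via
#                question.save(**update_kwargs), and a FLAG_ACCEPTED note is
#                left when the topic came from the LLM
```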